{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Portuguese Bank Marketing Strategy- TPOT Tutorial"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be ('yes') or not ('no') subscribed.\n",
    "https://archive.ics.uci.edu/ml/datasets/Bank+Marketing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "source": [
    "# Import required libraries\n",
    "from tpot import TPOTClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd \n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>job</th>\n",
       "      <th>marital</th>\n",
       "      <th>education</th>\n",
       "      <th>default</th>\n",
       "      <th>housing</th>\n",
       "      <th>loan</th>\n",
       "      <th>contact</th>\n",
       "      <th>month</th>\n",
       "      <th>day_of_week</th>\n",
       "      <th>...</th>\n",
       "      <th>campaign</th>\n",
       "      <th>pdays</th>\n",
       "      <th>previous</th>\n",
       "      <th>poutcome</th>\n",
       "      <th>emp.var.rate</th>\n",
       "      <th>cons.price.idx</th>\n",
       "      <th>cons.conf.idx</th>\n",
       "      <th>euribor3m</th>\n",
       "      <th>nr.employed</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>56</td>\n",
       "      <td>housemaid</td>\n",
       "      <td>married</td>\n",
       "      <td>basic.4y</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>telephone</td>\n",
       "      <td>may</td>\n",
       "      <td>mon</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>999</td>\n",
       "      <td>0</td>\n",
       "      <td>nonexistent</td>\n",
       "      <td>1.1</td>\n",
       "      <td>93.994</td>\n",
       "      <td>-36.4</td>\n",
       "      <td>4.857</td>\n",
       "      <td>5191.0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>57</td>\n",
       "      <td>services</td>\n",
       "      <td>married</td>\n",
       "      <td>high.school</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>telephone</td>\n",
       "      <td>may</td>\n",
       "      <td>mon</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>999</td>\n",
       "      <td>0</td>\n",
       "      <td>nonexistent</td>\n",
       "      <td>1.1</td>\n",
       "      <td>93.994</td>\n",
       "      <td>-36.4</td>\n",
       "      <td>4.857</td>\n",
       "      <td>5191.0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>37</td>\n",
       "      <td>services</td>\n",
       "      <td>married</td>\n",
       "      <td>high.school</td>\n",
       "      <td>no</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>telephone</td>\n",
       "      <td>may</td>\n",
       "      <td>mon</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>999</td>\n",
       "      <td>0</td>\n",
       "      <td>nonexistent</td>\n",
       "      <td>1.1</td>\n",
       "      <td>93.994</td>\n",
       "      <td>-36.4</td>\n",
       "      <td>4.857</td>\n",
       "      <td>5191.0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>40</td>\n",
       "      <td>admin.</td>\n",
       "      <td>married</td>\n",
       "      <td>basic.6y</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>telephone</td>\n",
       "      <td>may</td>\n",
       "      <td>mon</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>999</td>\n",
       "      <td>0</td>\n",
       "      <td>nonexistent</td>\n",
       "      <td>1.1</td>\n",
       "      <td>93.994</td>\n",
       "      <td>-36.4</td>\n",
       "      <td>4.857</td>\n",
       "      <td>5191.0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>56</td>\n",
       "      <td>services</td>\n",
       "      <td>married</td>\n",
       "      <td>high.school</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>yes</td>\n",
       "      <td>telephone</td>\n",
       "      <td>may</td>\n",
       "      <td>mon</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>999</td>\n",
       "      <td>0</td>\n",
       "      <td>nonexistent</td>\n",
       "      <td>1.1</td>\n",
       "      <td>93.994</td>\n",
       "      <td>-36.4</td>\n",
       "      <td>4.857</td>\n",
       "      <td>5191.0</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 21 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   age        job  marital    education  default housing loan    contact  \\\n",
       "0   56  housemaid  married     basic.4y       no      no   no  telephone   \n",
       "1   57   services  married  high.school  unknown      no   no  telephone   \n",
       "2   37   services  married  high.school       no     yes   no  telephone   \n",
       "3   40     admin.  married     basic.6y       no      no   no  telephone   \n",
       "4   56   services  married  high.school       no      no  yes  telephone   \n",
       "\n",
       "  month day_of_week ...  campaign  pdays  previous     poutcome emp.var.rate  \\\n",
       "0   may         mon ...         1    999         0  nonexistent          1.1   \n",
       "1   may         mon ...         1    999         0  nonexistent          1.1   \n",
       "2   may         mon ...         1    999         0  nonexistent          1.1   \n",
       "3   may         mon ...         1    999         0  nonexistent          1.1   \n",
       "4   may         mon ...         1    999         0  nonexistent          1.1   \n",
       "\n",
       "   cons.price.idx  cons.conf.idx  euribor3m  nr.employed   y  \n",
       "0          93.994          -36.4      4.857       5191.0  no  \n",
       "1          93.994          -36.4      4.857       5191.0  no  \n",
       "2          93.994          -36.4      4.857       5191.0  no  \n",
       "3          93.994          -36.4      4.857       5191.0  no  \n",
       "4          93.994          -36.4      4.857       5191.0  no  \n",
       "\n",
       "[5 rows x 21 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Load the data\n",
    "Marketing=pd.read_csv('Data_FinalProject.csv')\n",
    "Marketing.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "loan     y  \n",
       "no       no     30100\n",
       "         yes     3850\n",
       "unknown  no       883\n",
       "         yes      107\n",
       "yes      no      5565\n",
       "         yes      683\n",
       "Name: y, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Marketing.groupby('loan').y.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "loan     marital   y  \n",
       "no       divorced  no      3420\n",
       "                   yes      396\n",
       "         married   no     18469\n",
       "                   yes     2098\n",
       "         single    no      8155\n",
       "                   yes     1345\n",
       "         unknown   no        56\n",
       "                   yes       11\n",
       "unknown  divorced  no       113\n",
       "                   yes        8\n",
       "         married   no       528\n",
       "                   yes       60\n",
       "         single    no       241\n",
       "                   yes       39\n",
       "         unknown   no         1\n",
       "yes      divorced  no       603\n",
       "                   yes       72\n",
       "         married   no      3399\n",
       "                   yes      374\n",
       "         single    no      1552\n",
       "                   yes      236\n",
       "         unknown   no        11\n",
       "                   yes        1\n",
       "Name: y, dtype: int64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Marketing.groupby(['loan','marital']).y.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Munging"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The first and most important step in using TPOT on any data set is to rename the target class/response variable to class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Marketing.rename(columns={'y': 'class'}, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At present, TPOT requires all the data to be in numerical format. As we can see below, our data set has 11 categorical variables \n",
    "which contain non-numerical values: job, marital, education, default, housing, loan, contact, month, day_of_week, poutcome, class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age                 int64\n",
       "job                object\n",
       "marital            object\n",
       "education          object\n",
       "default            object\n",
       "housing            object\n",
       "loan               object\n",
       "contact            object\n",
       "month              object\n",
       "day_of_week        object\n",
       "duration            int64\n",
       "campaign            int64\n",
       "pdays               int64\n",
       "previous            int64\n",
       "poutcome           object\n",
       "emp.var.rate      float64\n",
       "cons.price.idx    float64\n",
       "cons.conf.idx     float64\n",
       "euribor3m         float64\n",
       "nr.employed       float64\n",
       "class              object\n",
       "dtype: object"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Marketing.dtypes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We then check the number of levels that each of the five categorical variables have."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of levels in category 'job': \b 12.00 \n",
      "Number of levels in category 'marital': \b 4.00 \n",
      "Number of levels in category 'education': \b 8.00 \n",
      "Number of levels in category 'default': \b 3.00 \n",
      "Number of levels in category 'housing': \b 3.00 \n",
      "Number of levels in category 'loan': \b 3.00 \n",
      "Number of levels in category 'contact': \b 2.00 \n",
      "Number of levels in category 'month': \b 10.00 \n",
      "Number of levels in category 'day_of_week': \b 5.00 \n",
      "Number of levels in category 'poutcome': \b 3.00 \n",
      "Number of levels in category 'class': \b 2.00 \n"
     ]
    }
   ],
   "source": [
    "for cat in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome' ,'class']:\n",
    "    print(\"Number of levels in category '{0}': \\b {1:2.2f} \".format(cat, Marketing[cat].unique().size))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As we can see, contact and poutcome have few levels. Let's find out what they are."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Levels for catgeory 'contact': ['telephone' 'cellular']\n",
      "Levels for catgeory 'poutcome': ['nonexistent' 'failure' 'success']\n",
      "Levels for catgeory 'class': ['no' 'yes']\n",
      "Levels for catgeory 'marital': ['married' 'single' 'divorced' 'unknown']\n",
      "Levels for catgeory 'default': ['no' 'unknown' 'yes']\n",
      "Levels for catgeory 'housing': ['no' 'yes' 'unknown']\n",
      "Levels for catgeory 'loan': ['no' 'yes' 'unknown']\n"
     ]
    }
   ],
   "source": [
    "for cat in ['contact', 'poutcome','class', 'marital', 'default', 'housing', 'loan']:\n",
    "    print(\"Levels for catgeory '{0}': {1}\".format(cat, Marketing[cat].unique()))    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We then code these levels manually into numerical values. For nan i.e. the missing values, we simply replace them with a placeholder value (-999). In fact, we perform this replacement for the entire data set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "Marketing['marital'] = Marketing['marital'].map({'married':0,'single':1,'divorced':2,'unknown':3})\n",
    "Marketing['default'] = Marketing['default'].map({'no':0,'yes':1,'unknown':2})\n",
    "Marketing['housing'] = Marketing['housing'].map({'no':0,'yes':1,'unknown':2})\n",
    "Marketing['loan'] = Marketing['loan'].map({'no':0,'yes':1,'unknown':2})\n",
    "Marketing['contact'] = Marketing['contact'].map({'telephone':0,'cellular':1})\n",
    "Marketing['poutcome'] = Marketing['poutcome'].map({'nonexistent':0,'failure':1,'success':2})\n",
    "Marketing['class'] = Marketing['class'].map({'no':0,'yes':1})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "age               False\n",
       "job               False\n",
       "marital           False\n",
       "education         False\n",
       "default           False\n",
       "housing           False\n",
       "loan              False\n",
       "contact           False\n",
       "month             False\n",
       "day_of_week       False\n",
       "duration          False\n",
       "campaign          False\n",
       "pdays             False\n",
       "previous          False\n",
       "poutcome          False\n",
       "emp.var.rate      False\n",
       "cons.price.idx    False\n",
       "cons.conf.idx     False\n",
       "euribor3m         False\n",
       "nr.employed       False\n",
       "class             False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Marketing = Marketing.fillna(-999)\n",
    "pd.isnull(Marketing).any()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For other categorical variables, we encode the levels as digits using Scikit-learn's MultiLabelBinarizer and treat them as new features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MultiLabelBinarizer\n",
    "mlb = MultiLabelBinarizer()\n",
    "\n",
    "job_Trans = mlb.fit_transform([{str(val)} for val in Marketing['job'].values])\n",
    "education_Trans = mlb.fit_transform([{str(val)} for val in Marketing['education'].values])\n",
    "month_Trans = mlb.fit_transform([{str(val)} for val in Marketing['month'].values])\n",
    "day_of_week_Trans = mlb.fit_transform([{str(val)} for val in Marketing['day_of_week'].values])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0, 1, 0, 0, 0],\n",
       "       [0, 1, 0, 0, 0],\n",
       "       [0, 1, 0, 0, 0],\n",
       "       ..., \n",
       "       [1, 0, 0, 0, 0],\n",
       "       [1, 0, 0, 0, 0],\n",
       "       [1, 0, 0, 0, 0]])"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "day_of_week_Trans"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop the unused features from the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "marketing_new = Marketing.drop(['marital','default','housing','loan','contact','poutcome','class','job','education','month','day_of_week'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "assert (len(Marketing['day_of_week'].unique()) == len(mlb.classes_)), \"Not Equal\" #check correct encoding done"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['mon', 'tue', 'wed', 'thu', 'fri'], dtype=object),\n",
       " array(['fri', 'mon', 'thu', 'tue', 'wed'], dtype=object))"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Marketing['day_of_week'].unique(),mlb.classes_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We then add the encoded features to form the final dataset to be used with TPOT."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "marketing_new = np.hstack((marketing_new.values, job_Trans, education_Trans, month_Trans, day_of_week_Trans))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.isnan(marketing_new).any()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Keeping in mind that the final dataset is in the form of a numpy array, we can check the number of features in the final dataset as follows."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "45"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "marketing_new[0].size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally we store the class labels, which we need to predict, in a separate variable."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "marketing_class = Marketing['class'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Analysis using TPOT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To begin our analysis, we need to divide our training data into training and validation sets. The validation set is just to give us an idea of the test set error. The model selection and tuning is entirely taken care of by TPOT, so if we want to, we can skip creating this validation set."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(30891, 10297)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_indices, validation_indices = training_indices, testing_indices = train_test_split(Marketing.index, stratify = marketing_class, train_size=0.75, test_size=0.25)\n",
    "training_indices.size, validation_indices.size"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After that, we proceed to calling the `fit()`, `score()` and `export()` functions on our training dataset.\n",
    "An important TPOT parameter to set is the number of generations (via the `generations` kwarg). Since our aim is to just illustrate the use of TPOT, we assume the default setting of 100 generations, whilst bounding the total running time via the `max_time_mins` kwarg (which may, essentially, override the former setting). Further, we enable control for the maximum amount of time allowed for optimization of a single pipeline, via `max_eval_time_mins`.\n",
    "\n",
    "On a standard laptop with 4GB RAM, each generation takes approximately 5 minutes to run. Thus, for the default value of 100, without the explicit duration bound, the total run time could be roughly around 8 hours."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Warning: xgboost.XGBClassifier is not available and will not be used by TPOT.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 49pipeline [00:46,  1.10s/pipeline]                  "
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 1 - Current best internal CV score: 0.913728927925\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 71pipeline [01:11,  1.06s/pipeline]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 2 - Current best internal CV score: 0.913728927925\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 95pipeline [01:38,  1.28s/pipeline]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 3 - Current best internal CV score: 0.913728927925\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Optimization Progress: 115pipeline [01:58,  1.04s/pipeline]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generation 4 - Current best internal CV score: 0.913728927925\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "                                                           \r"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "2.00407131667 minutes have elapsed. TPOT will close down.\n",
      "TPOT closed prematurely. Will use the current best pipeline.\n",
      "\n",
      "Best pipeline: DecisionTreeClassifier(input_matrix, criterion=gini, max_depth=5, min_samples_leaf=16, min_samples_split=8)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TPOTClassifier(config_dict={'sklearn.ensemble.GradientBoostingClassifier': {'max_features': array([ 0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,  0.45,\n",
       "        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,\n",
       "        0.95,  1.  ]), 'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0], 'min_samples_... 0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,\n",
       "        0.95,  1.  ])}, 'sklearn.preprocessing.RobustScaler': {}},\n",
       "        crossover_rate=0.1, cv=5, disable_update_check=False,\n",
       "        early_stop=None, generations=1000000, max_eval_time_mins=0.04,\n",
       "        max_time_mins=2, mutation_rate=0.9, n_jobs=1, offspring_size=15,\n",
       "        periodic_checkpoint_folder=None, population_size=15,\n",
       "        random_state=None, scoring=None, subsample=1.0, verbosity=2,\n",
       "        warm_start=False)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpot = TPOTClassifier(verbosity=2, max_time_mins=2, max_eval_time_mins=0.04, population_size=15)\n",
    "tpot.fit(marketing_new[training_indices], marketing_class[training_indices])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In the above, 4 generations were computed, each giving the training efficiency of fitting model on the training set. As evident, the best pipeline is the one that has the CV score of 91.373%. The model that produces this result is one that fits a decision tree algorithm on the data set. Next, the test error is computed for validation purposes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.91628629697970287"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpot.score(marketing_new[validation_indices], Marketing.loc[validation_indices, 'class'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tpot.export('tpot_marketing_pipeline.py')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# %load tpot_marketing_pipeline.py\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "# NOTE: Make sure that the outcome column is labeled 'target' in the data file\n",
    "tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)\n",
    "features = tpot_data.drop('target', axis=1)\n",
    "training_features, testing_features, training_target, testing_target = \\\n",
    "            train_test_split(features, tpot_data['target'], random_state=None)\n",
    "\n",
    "# Average CV score on the training set was:0.913728927925\n",
    "exported_pipeline = DecisionTreeClassifier(criterion=\"gini\", max_depth=5, min_samples_leaf=16, min_samples_split=8)\n",
    "\n",
    "exported_pipeline.fit(training_features, training_target)\n",
    "results = exported_pipeline.predict(testing_features)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
